from pathlib import Path

import pandas as pd

# Raw CSV export and the Parquet copy the rest of the notebook reads.
csv_path = Path("data/eda_data.csv")
parquet_path = Path("data/eda.parquet")

# One-time conversion on your machine: build the Parquet file from the CSV
# only when it does not exist yet. An environment that has the Parquet file
# but not the raw CSV can then still load the data. If neither file exists,
# pd.read_csv raises FileNotFoundError as before.
if not parquet_path.exists():
    pd.read_csv(csv_path).to_parquet(parquet_path, index=False)

# `eda` is the DataFrame used by every later cell.
eda = pd.read_parquet(parquet_path)
---------------------------------------------------------------------------FileNotFoundError Traceback (most recent call last)
Cell In[1], line 4 1importpandasaspd 3# one-time conversion on your machine----> 4 eda =pd.read_csv("data/eda_data.csv") 5 eda.to_parquet("data/eda.parquet", index=False)
6 eda = pd.read_parquet("data/eda.parquet")
File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend) 1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...) 1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026return_read(filepath_or_buffer,kwds)
File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds) 617 _validate_names(kwds.get("names", None))
619# Create the parser.--> 620 parser =TextFileReader(filepath_or_buffer,**kwds) 622if chunksize or iterator:
623return parser
File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds) 1617self.options["has_index_names"] = kwds["has_index_names"]
1619self.handles: IOHandles |None=None-> 1620self._engine =self._make_engine(f,self.engine)
File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine) 1878if"b"notin mode:
1879 mode +="b"-> 1880self.handles =get_handle( 1881f, 1882mode, 1883encoding=self.options.get("encoding",None), 1884compression=self.options.get("compression",None), 1885memory_map=self.options.get("memory_map",False), 1886is_text=is_text, 1887errors=self.options.get("encoding_errors","strict"), 1888storage_options=self.options.get("storage_options",None), 1889) 1890assertself.handles isnotNone 1891 f =self.handles.handle
File /opt/hostedtoolcache/Python/3.11.12/x64/lib/python3.11/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 868elifisinstance(handle, str):
869# Check whether the filename is to be opened in binary mode. 870# Binary mode does not support 'encoding' and 'newline'. 871if ioargs.encoding and"b"notin ioargs.mode:
872# Encoding--> 873 handle =open( 874handle, 875ioargs.mode, 876encoding=ioargs.encoding, 877errors=errors, 878newline="", 879) 880else:
881# Binary mode 882 handle =open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: 'data/eda_data.csv'
Code
# Identify data-analyst jobs by keyword searching.
# NOTE: every entry needs its own trailing comma. Adjacent string literals
# are silently concatenated by Python, which previously fused
# 'Market Research Analyst' and 'LLM' into a single keyword that never matched.
keywords = [
    'Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
    'Data Science', 'Data Analysis', 'Data Analytics', 'Market Research Analyst',
    'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
    'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst',
    'Operations Analyst',
]


def match(col):
    """Return a boolean Series flagging rows of ``eda[col]`` that mention a keyword.

    The keywords are joined with ``|`` into one alternation pattern and
    searched case-insensitively; missing values count as "no match"
    (``na=False``).
    """
    return eda[col].str.contains('|'.join(keywords), case=False, na=False)


# A posting is flagged when any of the three text columns mentions a keyword.
eda['DATA_ANALYST_JOB'] = (
    match('TITLE_NAME')
    | match('SKILLS_NAME')
    | match('SPECIALIZED_SKILLS_NAME')
)
eda['DATA_ANALYST_JOB'].value_counts()
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[2], line 9 2 keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
3'Data Science', 'Data Analysis','Data Analytics', 'Market Research Analyst' 4'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
5'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
7 match =lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)
----> 9 eda['DATA_ANALYST_JOB'] =match('TITLE_NAME') \
10| match('SKILLS_NAME') \
11| match('SPECIALIZED_SKILLS_NAME')
12 eda['DATA_ANALYST_JOB'].value_counts()
Cell In[2], line 7, in <lambda>(col) 1# identifying data analyst jobs by keyword searching 2 keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
3'Data Science', 'Data Analysis','Data Analytics', 'Market Research Analyst' 4'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
5'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
----> 7 match =lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)
9 eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
10| match('SKILLS_NAME') \
11| match('SPECIALIZED_SKILLS_NAME')
12 eda['DATA_ANALYST_JOB'].value_counts()
NameError: name 'eda' is not defined
Code
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------------------------------------------------------
# 1) Count postings per (analyst flag, industry) and pivot to one row per industry
# -----------------------------------------------------------------------------
df_grouped = (
    eda
    .groupby(['DATA_ANALYST_JOB', 'NAICS2_NAME'])
    .size()
    .reset_index(name='Job_Count')
)

# Shorter display names for the longest industry labels; unmapped names are kept.
short_names = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True: 'True', False: 'False'})

pivot = (
    df_grouped
    .pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
    # Guarantee both columns exist even when one class has no postings at all,
    # so the lookups below cannot raise KeyError.
    .reindex(columns=['True', 'False'], fill_value=0)
    .reset_index()
)

industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()

# -----------------------------------------------------------------------------
# 2) Build a 2-row subplot: bar on top, table below
# -----------------------------------------------------------------------------
fig = make_subplots(
    rows=2, cols=1,
    row_heights=[0.70, 0.30],  # give a bit more room to the table
    specs=[[{"type": "bar"}], [{"type": "table"}]],
    vertical_spacing=0.12,  # more space between bar and table
)

colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}

fig.add_trace(
    go.Bar(
        x=industries, y=y_true, name='True',
        marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
        text=y_true, textposition='outside',
    ),
    row=1, col=1,
)
fig.add_trace(
    go.Bar(
        x=industries, y=y_false, name='False',
        marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
        text=y_false, textposition='outside',
    ),
    row=1, col=1,
)
fig.add_trace(
    go.Table(
        header=dict(
            values=["Industry", "True", "False"],
            fill_color='#FDEDEC',
            align='left',
            font=dict(color='#A81D1D', size=13),
            height=30,
        ),
        cells=dict(
            values=[industries, y_true, y_false],
            fill_color='white',
            align='left',
            font=dict(color='#333', size=11),
            height=22,
        ),
    ),
    row=2, col=1,
)

# -----------------------------------------------------------------------------
# 3) Slider steps: 0 → 8 000 in 200s
# -----------------------------------------------------------------------------
# The slider only filters the two bar traces (indices 0 and 1). Passing the
# indices explicitly keeps the two-element update from also being applied to
# the table trace (index 2).
bar_trace_indices = [0, 1]

steps = []
for val in range(0, 8001, 200):
    steps.append(dict(
        label=str(val),
        method="update",
        args=[
            {
                # Bars below the threshold are collapsed to zero ...
                "y": [
                    [v if v >= val else 0 for v in y_true],
                    [v if v >= val else 0 for v in y_false],
                ],
                # ... and their labels are blanked, so a hidden bar does not
                # keep showing its original count.
                "text": [
                    [v if v >= val else "" for v in y_true],
                    [v if v >= val else "" for v in y_false],
                ],
            },
            # Update only the title text so the title's font and position
            # configured in the layout are preserved.
            {"title.text": f"Min Jobs ≥ {val:,}"},
            bar_trace_indices,
        ],
    ))

# -----------------------------------------------------------------------------
# 4) Final layout tweaks
# -----------------------------------------------------------------------------
fig.update_layout(
    # lift slider above everything
    sliders=[dict(
        active=0,
        currentvalue={"prefix": "Min Jobs: "},
        pad={"b": 0},
        x=0.15, y=1.18,  # move slider way above the plot area
        xanchor="left", yanchor="bottom",
        len=0.7,
        font=dict(color='#A81D1D'),
        steps=steps,
    )],
    title=dict(
        text="Data & Business Analytics Job Trends",
        font=dict(size=24, color='#A81D1D'),
        x=0.5, y=0.92,  # drop the title just below the slider
        xanchor="center", yanchor="top",
    ),
    width=1100, height=850,
    margin=dict(l=60, r=60, t=180, b=200),  # extra top & bottom margin
    plot_bgcolor='white', paper_bgcolor='white',
    xaxis=dict(
        title="Industry",
        title_font=dict(size=16, color='#A81D1D'),
        tickmode='array',
        tickvals=list(range(len(industries))),
        ticktext=industries,
        tickangle=-30,
        tickfont=dict(size=11, color='#333'),
        showline=True, linecolor='#A81D1D',
    ),
    yaxis=dict(
        title="Number of Jobs",
        title_font=dict(size=16, color='#A81D1D'),
        tickfont=dict(size=11, color='#333'),
        gridcolor='rgba(200,200,200,0.3)',
        showline=True, linecolor='#A81D1D',
        # Fixed range with 20% headroom so the axis does not rescale as the
        # slider zeroes out bars.
        range=[0, max(max(y_true), max(y_false)) * 1.2],
    ),
    legend=dict(
        title="Data Analyst Job",
        title_font=dict(color='#A81D1D'),
        font=dict(size=12),
        x=1.02, y=0.5,
    ),
    bargap=0.2,
)

fig.write_html("figures/edaplot1.html", include_plotlyjs="cdn", full_html=False)
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[3], line 6 2importplotly.graph_objectsasgo 3fromplotly.subplotsimport make_subplots
5 df_grouped = (
----> 6eda 7.groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
8.size()
9.reset_index(name='Job_Count')
10 )
12 short_names = {
13'Professional, Scientific, and Technical Services': 'Prof. Services',
14'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
(...) 22'Other Services (except Public Administration)': 'Other Services' 23 }
24 df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
NameError: name 'eda' is not defined
Code
import plotly.express as px
import pandas as pd


def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies when its ``DATA_ANALYST_JOB`` flag is truthy, or when its
    title (``TITLE_NAME`` if the row has that field, otherwise ``TITLE``)
    contains "business analyst", compared in lower case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_field = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_field]).lower()


# Work on a copy so the shared `eda` frame is not modified by this cell.
df = eda.copy()
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Box plot: salary distribution per remote type, split by job category.
fig = px.box(
    df,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='Job_Category',
    title='Salary Distribution by Remote Type for Analytics vs Non-Analytics Jobs',
    labels={
        'REMOTE_TYPE_NAME': 'Remote Type',
        'SALARY': 'Salary ($)',
        'Job_Category': 'Job Category',
    },
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
)

# Red-white theme (no gradients), assembled piece by piece.
title_style = dict(
    font=dict(size=24, color="#FF6B6B"),  # red title for theme
    x=0.5,
    xanchor="center",
    y=0.95,
    yanchor="top",
)
x_axis_style = dict(
    title="Remote Type",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
)
y_axis_style = dict(
    title="Salary ($)",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
    showgrid=True,
    zeroline=False,
)
legend_style = dict(
    title="Job Category",
    font=dict(size=13),
    bgcolor="#FFFFFF",
    bordercolor="#FF6B6B",  # red border for theme
    borderwidth=1,
    x=1.02,
    y=0.5,
    xanchor="left",
    yanchor="middle",
)
hover_style = dict(
    bgcolor="#FFFFFF",
    font_size=12,
    font_family="Inter, sans-serif",
    font_color="#2D3748",
    bordercolor="#FF6B6B",  # red border for hover
)

fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=title_style,
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    legend=legend_style,
    hovermode="closest",
    hoverlabel=hover_style,
)

fig.write_html("figures/edaplot2.html", include_plotlyjs="cdn", full_html=False)
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[4], line 5 2importpandasaspd 4# Prepare the data----> 5 df =eda.copy()
7# Define analytics jobs (Data Analyst + Business Analyst) 8defclassify_analytics_job(row):
NameError: name 'eda' is not defined
Code
import plotly.express as px
import pandas as pd


def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies when its ``DATA_ANALYST_JOB`` flag is truthy, or when its
    title (``TITLE_NAME`` if the row has that field, otherwise ``TITLE``)
    contains "business analyst", compared in lower case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_field = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_field]).lower()


# Work on a copy so the shared `eda` frame is not modified by this cell.
df = eda.copy()
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Posting counts per (industry, analytics flag).
industry_counts = df.groupby(['NAICS2_NAME', 'IS_ANALYTICS_JOB']).size()
df_grouped = industry_counts.reset_index(name='Job_Count')
df_grouped['Job_Category'] = df_grouped['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Shorter industry names for readability; unmapped names are kept as-is.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
shortened = df_grouped['NAICS2_NAME'].map(short_map)
df_grouped['Industry'] = shortened.fillna(df_grouped['NAICS2_NAME'])

# Stacked bar chart: job counts per industry, stacked by job category.
fig = px.bar(
    df_grouped,
    x='Industry',
    y='Job_Count',
    color='Job_Category',
    title='Top Industries Hiring Analytics Jobs',
    labels={
        'Industry': 'Industry',
        'Job_Count': 'Number of Jobs',
        'Job_Category': 'Job Category',
    },
    barmode='stack',
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
)

# Red-white theme (no gradients), assembled piece by piece.
title_style = dict(
    font=dict(size=24, color="#FF6B6B"),  # red title for theme
    x=0.5,
    xanchor="center",
    y=0.95,
    yanchor="top",
)
x_axis_style = dict(
    title="Industry",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    tickangle=-45,
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
)
y_axis_style = dict(
    title="Number of Jobs",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
    showgrid=True,
    zeroline=False,
)
legend_style = dict(
    title="Job Category",
    font=dict(size=13),
    bgcolor="#FFFFFF",
    bordercolor="#FF6B6B",  # red border for theme
    borderwidth=1,
    x=1.02,
    y=0.5,
    xanchor="left",
    yanchor="middle",
)
hover_style = dict(
    bgcolor="#FFFFFF",
    font_size=12,
    font_family="Inter, sans-serif",
    font_color="#2D3748",
    bordercolor="#FF6B6B",  # red border for hover
)

fig.update_layout(
    width=1000,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=title_style,
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    legend=legend_style,
    hovermode="closest",
    hoverlabel=hover_style,
)

fig.write_html("figures/edaplot3.html", include_plotlyjs="cdn", full_html=False)
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[5], line 5 2importpandasaspd 4# Prepare the data----> 5 df =eda.copy()
7# Define analytics jobs (Data Analyst + Business Analyst) 8defclassify_analytics_job(row):
NameError: name 'eda' is not defined
Code
import plotly.express as px
import pandas as pd


def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies when its ``DATA_ANALYST_JOB`` flag is truthy, or when its
    title (``TITLE_NAME`` if the row has that field, otherwise ``TITLE``)
    contains "business analyst", compared in lower case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_field = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_field]).lower()


# Work on a copy so the shared `eda` frame is not modified by this cell.
df = eda.copy()
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Midpoint of the posted minimum and maximum years of experience.
experience_total = df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']
df['Avg_Years_Experience'] = experience_total / 2

# Keep only rows that have both an experience midpoint and a salary.
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])

# Scatter plot with an ordinary-least-squares trend line.
fig = px.scatter(
    df,
    x='Avg_Years_Experience',
    y='SALARY',
    color='Job_Category',
    trendline='ols',  # add trend line (ordinary least squares)
    title='Experience Requirements vs Salary for Analytics Jobs',
    labels={
        'Avg_Years_Experience': 'Average Years of Experience',
        'SALARY': 'Salary ($)',
        'Job_Category': 'Job Category',
    },
    color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
)

# Red-white theme (no gradients), assembled piece by piece.
title_style = dict(
    font=dict(size=24, color="#FF6B6B"),  # red title for theme
    x=0.5,
    xanchor="center",
    y=0.95,
    yanchor="top",
)
x_axis_style = dict(
    title="Average Years of Experience",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
    showgrid=True,
    zeroline=False,
)
y_axis_style = dict(
    title="Salary ($)",
    title_font=dict(size=16),
    tickfont=dict(size=12),
    gridcolor="#E2E8F0",
    linecolor="#2D3748",
    linewidth=2,
    showline=True,
    showgrid=True,
    zeroline=False,
)
legend_style = dict(
    title="Job Category",
    font=dict(size=13),
    bgcolor="#FFFFFF",
    bordercolor="#FF6B6B",  # red border for theme
    borderwidth=1,
    x=1.02,
    y=0.5,
    xanchor="left",
    yanchor="middle",
)
hover_style = dict(
    bgcolor="#FFFFFF",
    font_size=12,
    font_family="Inter, sans-serif",
    font_color="#2D3748",
    bordercolor="#FF6B6B",  # red border for hover
)

fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=title_style,
    xaxis=x_axis_style,
    yaxis=y_axis_style,
    legend=legend_style,
    hovermode="closest",
    hoverlabel=hover_style,
)

# Customize scatter points.
point_style = dict(
    size=8,
    opacity=0.7,
    line=dict(width=1, color="#2D3748"),
)
fig.update_traces(marker=point_style)

fig.write_html("figures/edaplot4.html", include_plotlyjs="cdn", full_html=False)
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[6], line 5 2importpandasaspd 4# Prepare the data----> 5 df =eda.copy()
7# Define analytics jobs (Data Analyst + Business Analyst) 8defclassify_analytics_job(row):
NameError: name 'eda' is not defined
Code
import plotly.graph_objects as go
import pandas as pd


def classify_analytics_job(row):
    """Return True when a posting row counts as an analytics job.

    A row qualifies when its ``DATA_ANALYST_JOB`` flag is truthy, or when its
    title (``TITLE_NAME`` if the row has that field, otherwise ``TITLE``)
    contains "business analyst", compared in lower case.
    """
    if row['DATA_ANALYST_JOB']:
        return True
    title_field = 'TITLE_NAME' if 'TITLE_NAME' in row else 'TITLE'
    return 'business analyst' in str(row[title_field]).lower()


# Work on a copy so the shared `eda` frame is not modified by this cell.
df = eda.copy()
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map(
    {True: 'Analytics Job', False: 'Non-Analytics Job'}
)

# Analytics postings only, and only those with a known industry.
df_analytics = df[df['IS_ANALYTICS_JOB']].copy()
df_analytics = df_analytics.dropna(subset=['NAICS2_NAME'])

# Posting counts per (job category, industry).
category_industry_counts = df_analytics.groupby(['Job_Category', 'NAICS2_NAME']).size()
df_grouped = category_industry_counts.reset_index(name='Job_Count')

# Shorter industry names for readability; unmapped names are kept as-is.
short_map = {
    'Professional, Scientific, and Technical Services': 'Prof. Services',
    'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
    'Health Care and Social Assistance': 'Healthcare',
    'Finance and Insurance': 'Finance',
    'Information': 'Info Tech',
    'Educational Services': 'Education',
    'Manufacturing': 'Manufacturing',
    'Retail Trade': 'Retail',
    'Accommodation and Food Services': 'Hospitality',
    'Other Services (except Public Administration)': 'Other Services',
}
shortened = df_grouped['NAICS2_NAME'].map(short_map)
df_grouped['NAICS2_NAME'] = shortened.fillna(df_grouped['NAICS2_NAME'])

# Sankey nodes: job categories first, then industries.
labels = list(df_grouped['Job_Category'].unique()) + list(df_grouped['NAICS2_NAME'].unique())

# Position of the first occurrence of each label (same result as list.index).
node_position = {}
for position, node_label in enumerate(labels):
    node_position.setdefault(node_label, position)

# Each grouped row becomes one link: job category -> industry, weighted by count.
source = [node_position[job_cat] for job_cat in df_grouped['Job_Category']]
target = [node_position[industry] for industry in df_grouped['NAICS2_NAME']]
value = df_grouped['Job_Count'].tolist()

node_style = dict(
    pad=15,
    thickness=20,
    line=dict(color="#2D3748", width=0.5),
    label=labels,
    color="#FF6B6B",  # red nodes for the theme
)
link_style = dict(
    source=source,
    target=target,
    value=value,
    color="rgba(255, 107, 107, 0.5)",  # semi-transparent red links
)
fig = go.Figure(data=[go.Sankey(node=node_style, link=link_style)])

# Red-white theme (no gradients).
title_style = dict(
    text='Distribution of Analytics Job Postings by Industry',
    font=dict(size=24, color="#FF6B6B"),  # red title for theme
    x=0.5,
    xanchor="center",
    y=0.95,
    yanchor="top",
)
fig.update_layout(
    width=900,
    height=600,
    plot_bgcolor='#FFFFFF',   # plain white background
    paper_bgcolor='#FFFFFF',  # plain white background
    font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
    title=title_style,
    margin=dict(l=20, r=20, t=80, b=20),
)

fig.write_html("figures/edaplot5.html", include_plotlyjs="cdn", full_html=False)
---------------------------------------------------------------------------NameError Traceback (most recent call last)
Cell In[7], line 5 2importpandasaspd 4# Prepare the data----> 5 df =eda.copy()
7# Define analytics jobs (Data Analyst + Business Analyst) 8defclassify_analytics_job(row):
NameError: name 'eda' is not defined